{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 多元线性回归"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "class LinearRegression:\n",
    "        \n",
    "        def __init__(self):\n",
    "            self.coef_ = None\n",
    "            self.interception_ = None\n",
    "            self._theta = None\n",
    "        \n",
    "        def fit_normal(self, X_train, Y_train):\n",
    "            X_b = np.hstack([np.ones((len(X_train),1)),X_train])\n",
    "            self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(Y_train)\n",
    "            self.interception_ = self._theta[0]\n",
    "            self.coef_ = self._theta[1:]\n",
    "            return self\n",
    "        def predict(self, X_predict):\n",
    "            X_b = np.hstack([np.ones((len(X_predict),1)),X_predict])\n",
    "            return X_b.dot(self._theta)\n",
    "        def score(self, X_test, Y_test):\n",
    "            y_predict = self.predict(X_test)\n",
    "            return mean_squared_error(Y_test,y_predict) / np.var(Y_test)\n",
    "        def __repr__(self):\n",
    "            return \"LinearRegression\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from sklearn import datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "boston = datasets.load_boston()\n",
    "X = boston.data\n",
    "y = boston.target\n",
    "\n",
    "X = X[y< 50.0]\n",
    "y = y[y< 50.0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(490, 13)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_test_split(X, y, test_ratio=0.2, seed=None):\n",
    "    \"\"\"将数据 X 和 y 按照test_ratio分割成X_train, X_test, y_train, y_test\"\"\"\n",
    "    assert X.shape[0] == y.shape[0], \\\n",
    "        \"the size of X must be equal to the size of y\"\n",
    "    assert 0.0 <= test_ratio <= 1.0, \\\n",
    "        \"test_ration must be valid\"\n",
    "\n",
    "    if seed:\n",
    "        np.random.seed(seed)\n",
    "\n",
    "    shuffled_indexes = np.random.permutation(len(X))\n",
    "\n",
    "    test_size = int(len(X) * test_ratio)\n",
    "    test_indexes = shuffled_indexes[:test_size]\n",
    "    train_indexes = shuffled_indexes[test_size:]\n",
    "\n",
    "    X_train = X[train_indexes]\n",
    "    y_train = y[train_indexes]\n",
    "\n",
    "    X_test = X[test_indexes]\n",
    "    y_test = y[test_indexes]\n",
    "\n",
    "    return X_train, X_test, y_train, y_test\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train,y_test = train_test_split(X,y,seed=1218)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LinearRegression"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reg = LinearRegression()\n",
    "reg.fit_normal(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-1.14134206e-01,  5.04529792e-02, -3.97788554e-02,  4.01149967e-01,\n",
       "       -1.51498843e+01,  3.20993522e+00, -1.81100112e-02, -1.43836339e+00,\n",
       "        2.49439919e-01, -1.32532791e-02, -8.73930943e-01,  8.36747233e-03,\n",
       "       -3.68601735e-01])"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reg.coef_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.23350976299658943"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reg.score(X_test,y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "# sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "lin_reg = LinearRegression()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lin_reg.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-1.14134206e-01,  5.04529792e-02, -3.97788554e-02,  4.01149967e-01,\n",
       "       -1.51498843e+01,  3.20993522e+00, -1.81100112e-02, -1.43836339e+00,\n",
       "        2.49439919e-01, -1.32532791e-02, -8.73930943e-01,  8.36747233e-03,\n",
       "       -3.68601735e-01])"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lin_reg.coef_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "38.01795508332663"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lin_reg.intercept_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7664902370034031"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lin_reg.score(X_test,y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5454438928152348"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.neighbors import KNeighborsRegressor\n",
    "\n",
    "knn_reg = KNeighborsRegressor()\n",
    "knn_reg.fit(X_train,y_train)\n",
    "knn_reg.score(X_test,y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 60 candidates, totalling 300 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.\n",
      "[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.4s\n",
      "[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    1.6s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=None, error_score=nan,\n",
       "             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,\n",
       "                                           metric='minkowski',\n",
       "                                           metric_params=None, n_jobs=None,\n",
       "                                           n_neighbors=5, p=2,\n",
       "                                           weights='uniform'),\n",
       "             iid='deprecated', n_jobs=-1,\n",
       "             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
       "                          'weights': ['uniform']},\n",
       "                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
       "                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],\n",
       "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
       "             scoring=None, verbose=1)"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#网格搜索\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "param_grid = [\n",
    "    {\n",
    "        \"weights\":[\"uniform\"],\n",
    "        \"n_neighbors\":[i for i in range(1,11)]\n",
    "    },\n",
    "    {\n",
    "        \"weights\":[\"distance\"],\n",
    "        \"n_neighbors\":[i for i in range(1,11)],\n",
    "        \"p\":[i for i in range(1,6)]\n",
    "    }\n",
    "]\n",
    "\n",
    "knn_reg = KNeighborsRegressor()\n",
    "grid_search = GridSearchCV(knn_reg,param_grid, n_jobs=-1, verbose=1)\n",
    "grid_search.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'n_neighbors': 8, 'p': 1, 'weights': 'distance'}"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grid_search.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6395337389045348"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grid_search.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6139006951349414"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grid_search.best_estimator_.score(X_test,y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 线性回归可解释性\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
